from pymongo import MongoClient
from bs4 import BeautifulSoup
import requests
import re

# Connection to the MongoDB server on this machine (27017 is the port passed
# explicitly here).
client = MongoClient(host='localhost', port=27017)

# NOTE: the variable is named after chanpin100 but the database itself is
# called 'zhihu_bolg_db'. The literal is kept unchanged on purpose: a
# different string would address a different database.
chanpin100_bolg_db = client['zhihu_bolg_db']

# Collection that receives one document per scraped article.
chanpin100_blog_collection = chanpin100_bolg_db[
    'chanpin100_blog_collection'
]

# Articles are addressed by appending a sequential integer id to this prefix.
ARTICLE_URL_PREFIX = 'http://www.chanpin100.com/article/'
# Text the script looks for to recognise a "page does not exist" response.
NOT_FOUND_MARKER = '啊哦，你所访问的页面不存在了'
# Seconds to wait for the server before giving up on a single page; without
# a timeout one stalled connection would block the whole crawl forever.
REQUEST_TIMEOUT = 10

# Compiled once instead of being looked up again on every loop iteration.
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
_WHITESPACE_RUN = re.compile(r'\s{2,}')
# The recommendation footer and whatever follows it on the same line
# ('.' does not match a newline).
_RECOMMEND_FOOTER = re.compile('本文由产品100为你推荐.+')


def clean_content(text):
    """Return article text with layout whitespace and the footer removed.

    The text is stripped, every run of two or more whitespace characters is
    deleted (single whitespace characters are kept), and then the
    '本文由产品100为你推荐' footer together with the rest of its line is
    removed.

    >>> clean_content('  a  b ')
    'ab'
    """
    text = _WHITESPACE_RUN.sub('', text.strip())
    return _RECOMMEND_FOOTER.sub('', text)


if __name__ == '__main__':
    for i in range(0, 110000):
        url = ARTICLE_URL_PREFIX + str(i)

        # A single unreachable page must not abort the whole crawl.
        try:
            page_source = requests.get(url, timeout=REQUEST_TIMEOUT).text
        except requests.RequestException as exc:
            print('skipped {}: {}'.format(url, exc))
            continue

        if NOT_FOUND_MARKER in page_source:
            continue

        soup = BeautifulSoup(page_source, 'lxml')
        title_node = soup.find(class_='article-title')
        content_node = soup.find(class_='article-content')
        # find() returns None when the element is absent; skip pages that do
        # not have the expected article layout instead of crashing.
        if title_node is None or content_node is None:
            print('skipped {}: article markup not found'.format(url))
            continue

        title = title_node.get_text().strip()
        content = clean_content(content_node.get_text())

        print(title)
        print()
        print(content)
        print('-' * 90)

        # Store each article only once, keyed by its URL.
        if not chanpin100_blog_collection.find_one({'url': url}):
            chanpin100_blog_collection.insert_one({
                'url': url,
                'title': title,
                'content': content,
            })
